Today's world lives and depends on reviews. If you want to buy a product, visit a place, or stay in a hotel, you will first look at the reviews. Reviews are written on many sources, such as websites, Twitter, and Facebook, and these reviews are in text format.
This makes it very difficult for hotel management to identify the areas where they need to improve, as they do not have a clear picture of the reviews. It is also becoming hard for users to select a hotel that is genuinely good.
Coming to our business, hotel owners must be quick to respond to user reviews coming from any of the platforms mentioned above. A prompt response shows the loyalty of the hotel, especially in the case of negative reviews. In today's world, processing these reviews, which are being generated tremendously, is a tedious job.
I would like to help the hotel owners by giving them the Sentiment of the review whether it is Excellent, good or bad so that the hotel management can take necessary actions for the negative review. Also I would like to help hotel management by showing them in which group of hotels they fall in based on customer reviews.
Not only hotel management — I will also help users by recommending the best hotels they have not visited yet, based on their similarities with other users.
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
import warnings
import keras
import nltk
import re
import codecs
# General
import numpy as np
import pandas as pd
import nltk
import random
import os
from os import path
from PIL import Image
# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS
# Set Plot Theme
# FiveThirtyEight-style color cycle and matplotlib style for all figures.
sns.set_palette([
"#30a2da",
"#fc4f30",
"#e5ae38",
"#6d904f",
"#8b8b8b",
])
plt.style.use('fivethirtyeight')
# Pre-Processing
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
# Modeling
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.util import ngrams
from collections import Counter
from gensim.models import word2vec
# Warnings
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pylab as pylab
# Global figure defaults: extra-large labels/legend on a wide 15x5 canvas.
params = {'legend.fontsize': 'x-large',
'figure.figsize': (15, 5),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)
# Load the training data; the fourth column (index 3) is parsed as a datetime.
os.chdir("D:/PhD")
hotels = pd.read_csv("Train-1554810061973.csv", parse_dates=[3])
hotels.head()
hotels.tail()
print ('The train data has {0} rows and {1} columns'.format(hotels.shape[0],hotels.shape[1]))
hotels.describe(include='all' )
pd.set_option('max_colwidth', 500)
hotels.tail(2)
# Per-column dtype summary.
# NOTE(review): the first column holds column NAMES, so the label "Count" is
# misleading; the groupby below keys on "Column Type", which is correct.
dtype_df = hotels.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df
dtype_df.groupby("Column Type").aggregate('count').reset_index()
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarize missing data per column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column that has missing values, with the count
        ('Missing Values') and share ('% of Total Values', rounded to one
        decimal) of missing entries, sorted by percentage descending.
        Also prints a one-line summary of how many columns are affected.
    """
    mis_val = df.isnull().sum()
    # Reuse the counts instead of recomputing df.isnull().sum() a second time.
    mis_val_percent = 100 * mis_val / len(df)
    table = pd.concat([mis_val, mis_val_percent], axis=1)
    table = table.rename(columns={0: 'Missing Values', 1: '% of Total Values'})
    # Keep only columns that actually have missing values.
    table = table[table.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
    print("Your selected dataframe has {} columns.\n"
          "There are {} columns that have missing values.".format(
              df.shape[1], table.shape[0]))
    return table
missing_values_table(hotels)
# Number of distinct values per column, sorted ascending.
unique_counts = pd.DataFrame.from_records([(col, hotels[col].nunique()) for col in hotels.columns],
columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])
unique_counts
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.filterwarnings(action='once')
# Second round of plot defaults (notebook-style duplication of the earlier setup).
# NOTE(review): 'axes.titlesize' appears twice in this dict; the second (med) wins.
large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
'legend.fontsize': med,
'figure.figsize': (16, 10),
'axes.labelsize': med,
'axes.titlesize': med,
'xtick.labelsize': med,
'ytick.labelsize': med,
'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline
# Version
print(mpl.__version__) #> 3.0.0
print(sns.__version__) #> 0.9.0
# Class balance of the target variable.
hotels['Sentiment'].value_counts().plot.bar(title="Frequency Ditribution of Sentiment Class")
dates = hotels['Date']
dates.head()
# Expand the review date into calendar components for the EDA below.
date_expanded = pd.DataFrame({"year": dates.dt.year,
"month": dates.dt.month,
"dayofmonth": dates.dt.day,
"dayofyear": dates.dt.dayofyear,
"week": dates.dt.week,
"weekofyear": dates.dt.weekofyear,
"dayofweek": dates.dt.dayofweek,
"weekday": dates.dt.weekday,
"quarter": dates.dt.quarter,
})
date_expanded.head()
# Map numeric calendar fields to readable labels.
date_expanded['quarter'] = date_expanded['quarter'].map({1: 'Quarter 1', 2: "Quarter 2", 3:'Quarter 3', 4: 'Quarter 4'})
date_expanded['month'] = date_expanded['month'].map({1: 'Jan', 2: "Feb", 3:'Mar', 4: 'Apr',5: 'May', 6: "Jun", 7:'Jul', 8: 'Aug',9: 'Sep', 10: "Oct", 11:'Nov', 12: 'Dec'})
# BUG FIX: pandas Series.dt.dayofweek is 0=Monday .. 6=Sunday. The original map
# skipped key 3 (Thursday -> NaN), shifted Thursday-Sunday one day late, and
# used a key 7 that never occurs.
date_expanded['dayofweek'] = date_expanded['dayofweek'].map({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})
date_expanded.head()
# Review volume broken down by each calendar component.
date_expanded['year'].value_counts().plot.bar(title="No of Reviews given in each Year")
date_expanded['quarter'].value_counts().plot.bar(title= "Overall reviews for each Quarter")
date_expanded['month'].value_counts().plot.bar(title= "Overall reviews in each Month")
date_expanded['week'].value_counts().plot.bar(title= "Overall reviews in each week",figsize = (20,8))
date_expanded['dayofweek'].value_counts().plot.bar(title= "Overall reviews in dayofweek", figsize = (14,6))
date_expanded['dayofmonth'].value_counts().plot.bar(title="Overall no of reviews in each Day of the Month", figsize = (20,8), fontsize = 20)
date_new = date_expanded['dayofyear'].value_counts().head(20)
date_expanded['dayofyear'].value_counts().head(10).plot.bar(title= "Overall Highest Reviews in each dayofyear Sorted only 30 of 365",figsize = (20,8))
date_expanded['dayofyear'].value_counts().tail(10).plot.bar(title= "Overall Lowest Reviews in each dayofyear Sorted only 10 of 365",figsize = (18,8))
hotels['Hotelid'].value_counts().head(10).plot.bar(title= "Top 10 Hotels which got more Reviews",figsize = (16,8))
hotels['Hotelid'].value_counts().tail(10).plot.bar(title= "Top 10 Hotels which got Less Reviews",figsize = (16,8))
# Attach the expanded date columns to the main frame.
hotels = pd.concat([hotels, date_expanded], axis=1)
# Sentiment vs. calendar-component contingency tables.
yearwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["year"])
yearwise_reviews
quarterwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["quarter"])
quarterwise_reviews
monthwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["month"])
monthwise_reviews
pd.options.display.max_columns = None
weekwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["week"])
weekwise_reviews
dayofmonthwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["dayofmonth"])
dayofmonthwise_reviews
dayofyearwise_reviews = pd.crosstab(index=hotels["Sentiment"],
columns=hotels["dayofyear"])
dayofyearwise_reviews
hotels.head(1)
# Bar charts for the contingency tables computed above.
yearwise_reviews.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Year Wise Review Bar Graph" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
quarterwise_reviews.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Quarter Wise Review Bar Graph" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
monthwise_reviews.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Month Wise Review Bar Graph" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
# Two-level column crosstab: quarter, then month within the quarter.
quarter_month = pd.crosstab(index=hotels["Sentiment"],
columns=[hotels["quarter"],
hotels["month"]]
) # Include row and column totals
quarter_month
# Pulling out the third Quarter
quarter_month['Quarter 3']
high_month = quarter_month['Quarter 3']
high_month.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "3 rd Quarter Vs Month Wise Reviews" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews for 3rd Qaurter', fontsize=16)
low_month = quarter_month['Quarter 1']
low_month
low_month.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "1st Quarter Vs Month Wise Reviews" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews for 1st Qaurter', fontsize=16)
# Year x quarter breakdown used by the per-year plots that follow.
quarter_year = pd.crosstab(index=hotels["Sentiment"],
columns=[hotels["year"],
hotels["quarter"]]
) # Include row and column totals
quarter_year
# Quarter-wise sentiment distribution for each year in the data.
# BUG FIX: the original assigned quarter_year[2004] three times, plotted 2004
# twice, and reused the 2004 slice for the figure titled "2007"; each year now
# plots its own slice exactly once. (The stray no-op `plt.subplot` reference
# was dropped as well.)
for year in [2004, 2005, 2006, 2007]:
    quarter_year[year].plot(kind="bar",
                            figsize=(6, 5),
                            stacked=False,
                            title="Quarter Vs Year {} Reviews".format(year))
    plt.xlabel('Sentiment', fontsize=18)
    plt.ylabel('No Of Reviews', fontsize=16)
import matplotlib.pylab as pylab
# Restore the large-label plot defaults for the seaborn plots below.
params = {'legend.fontsize': 'x-large',
'figure.figsize': (15, 5),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)
# Hollow (outline-only) count plot of reviews per month, most frequent first.
sns.countplot(x='month',data=hotels,facecolor=(0, 0, 0, 0),
linewidth=5,
edgecolor=sns.color_palette("dark", 3),
order = hotels['month'].value_counts().index)
hotels['dayofyear'].hist(bins=365)
# Heatmaps of Percentage Pivot Table
# Year vs Sentiment: raw counts (left) and overall percentage (right).
f, ax = plt.subplots(1,2,figsize=(16, 4), sharey=True)
sns.heatmap(pd.crosstab(hotels['year'], hotels["Sentiment"]),
annot=True, linewidths=.5, ax = ax[0],fmt='g', cmap="Reds",
cbar_kws={'label': 'Count'})
ax[0].set_title('Year Wise Review Count by Sentiment - Crosstab\nHeatmap Overall Count Distribution')
sns.heatmap(pd.crosstab(hotels['year'], hotels["Sentiment"], normalize=True).mul(100).round(0),
annot=True, linewidths=.5, ax=ax[1],fmt='g', cmap="Greens",
cbar_kws={'label': 'Percentage %'})
ax[1].set_title('Year Wise Review Count by Sentiment - Crosstab\nHeatmap Overall Percentage Distribution')
ax[1].set_ylabel('')
plt.tight_layout(pad=0)
plt.show()
# Heatmaps of Percentage Pivot Table
# Year vs Sentiment: normalized per column (left) and per row (right).
f, ax = plt.subplots(1,2,figsize=(16, 4), sharey=True)
sns.heatmap(pd.crosstab(hotels['year'], hotels["Sentiment"], normalize='columns').mul(100).round(0),
annot=True, linewidths=.5, ax=ax[0],fmt='g', cmap="Reds",
cbar_kws={'label': 'Percentage %'})
ax[0].set_title('Year Wise Review Count by Sentiment - Crosstab\nHeatmap % Distribution by Columns')
sns.heatmap(pd.crosstab(hotels['year'], hotels["Sentiment"], normalize='index').mul(100).round(0),
annot=True, linewidths=.5, ax=ax[1],fmt='g', cmap="Greens",
cbar_kws={'label': 'Percentage %'})
ax[1].set_title('Year Wise Review Count by Sentiment - Crosstab\nHeatmap % Distribution by Index')
ax[1].set_ylabel('')
plt.tight_layout(pad=0)
plt.show()
# Heatmaps of Percentage Pivot Table
# Quarter vs Sentiment: raw counts (left) and overall percentage (right).
f, ax = plt.subplots(1,2,figsize=(16, 4), sharey=True)
sns.heatmap(pd.crosstab(hotels['quarter'], hotels["Sentiment"]),
annot=True, linewidths=.5, ax = ax[0],fmt='g', cmap="Reds",
cbar_kws={'label': 'Count'})
ax[0].set_title('Quarter Wise Review Count by Sentiment - Crosstab\nHeatmap Overall Count Distribution')
sns.heatmap(pd.crosstab(hotels['quarter'], hotels["Sentiment"], normalize=True).mul(100).round(0),
annot=True, linewidths=.5, ax=ax[1],fmt='g', cmap="Greens",
cbar_kws={'label': 'Percentage %'})
ax[1].set_title('Quarter Wise Review Count by Sentiment - Crosstab\nHeatmap Overall Percentage Distribution')
ax[1].set_ylabel('')
plt.tight_layout(pad=0)
plt.show()
# Heatmaps of Percentage Pivot Table
# Quarter vs Sentiment: normalized per column (left) and per row (right).
f, ax = plt.subplots(1,2,figsize=(16, 4), sharey=True)
sns.heatmap(pd.crosstab(hotels['quarter'], hotels["Sentiment"], normalize='columns').mul(100).round(0),
annot=True, linewidths=.5, ax=ax[0],fmt='g', cmap="Reds",
cbar_kws={'label': 'Percentage %'})
ax[0].set_title('Quarter Wise Review Count by Sentiment - Crosstab\nHeatmap % Distribution by Columns')
sns.heatmap(pd.crosstab(hotels['quarter'], hotels["Sentiment"], normalize='index').mul(100).round(0),
annot=True, linewidths=.5, ax=ax[1],fmt='g', cmap="Greens",
cbar_kws={'label': 'Percentage %'})
ax[1].set_title('Quarter Wise Review Count by Sentiment - Crosstab\nHeatmap % Distribution by Index')
ax[1].set_ylabel('')
plt.tight_layout(pad=0)
plt.show()
# Hollow count plot of the ten most-reviewed hotels.
sns.countplot(x='Hotelid',data=hotels,facecolor=(0, 0, 0, 0),
linewidth=5,
edgecolor=sns.color_palette("dark", 3),
order = hotels['Hotelid'].value_counts().head(10).index)
hotelid_sentiment = pd.crosstab(index=hotels["Sentiment"],
columns=[hotels["Hotelid"]]) # Include row and column totals
hotelid_sentiment
# Drill into a single hotel (hotel_188): overall, yearly and quarterly.
hotel_188 = hotelid_sentiment['hotel_188']
hotel_188
hotel_188.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Hotel_188 Sentiment Distribution" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
hotelid_year_sentiment = pd.crosstab(index=hotels["Sentiment"],
columns=[hotels["Hotelid"],hotels['year']]) # Include row and column totals
hotelid_year_sentiment
hotel_188_year = hotelid_year_sentiment['hotel_188']
hotel_188_year
hotel_188_year.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Hotel_188 Year Wise Sentiment Distribution" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
hotelid_quarter_sentiment = pd.crosstab(index=hotels["Sentiment"],
columns=[hotels["Hotelid"],hotels['quarter']]) # Include row and column totals
hotelid_quarter_sentiment
hotel_188_quarter = hotelid_quarter_sentiment['hotel_188']
hotel_188_quarter
hotel_188_quarter.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Hotel_188 Quarter Wise Sentiment Distribution" )
plt.xlabel('Sentiment', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
# Keep only the modelling-relevant columns from here on.
pd.set_option('max_colwidth', 500)
warnings.filterwarnings('ignore')
df = hotels
df[["Hotelid","Reviewid","reviewtext", "Sentiment"]].sample(2)
df = df[["Hotelid","Reviewid","reviewtext", "Sentiment"]]
df.describe()
def standardize_text(df, text_field):
    """Normalize free-text in ``df[text_field]`` in place (and return *df*).

    Strips URLs and @-mentions, collapses characters outside a small
    whitelist to spaces, rewrites '@' as 'at', and lowercases the text.
    """
    # regex=True is explicit: newer pandas treats single-character patterns
    # literally by default, which would silently change this behavior.
    df[text_field] = df[text_field].str.replace(r"http\S+", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "", regex=True)
    # Character whitelist. The original class listed '!' several times;
    # duplicates inside a character class are redundant, so this is equivalent.
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    # NOTE(review): r"$" is the end-of-string anchor, so this replace is a
    # no-op; if the intent was to drop dollar signs it should be r"\$".
    df[text_field] = df[text_field].str.replace(r"$", "", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df
# Apply the text normalizer to the review column.
df = standardize_text(df, "reviewtext")
df.tail(2)
def remove_punctuation(text):
    """Return *text* with every ASCII punctuation character deleted."""
    import string
    # str.translate removes the whole punctuation set in one C-level pass.
    return text.translate(str.maketrans('', '', string.punctuation))
df['reviewtext'] = df['reviewtext'].apply(remove_punctuation)
df.tail(2)
# Persist the cleaned text for reuse.
df.to_csv("df_clean_data.csv")
df.groupby("Sentiment").count()
text = " ".join(review for review in df.reviewtext)
# NOTE(review): len(text) counts characters, not words.
print ("There are {} words in the combination of all review.".format(len(text)))
import string
# Word cloud over the full corpus.
wordcloud = WordCloud(background_color="Black").generate(text)
# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("Most Frequent Words in all the Review Text")
plt.axis("off")
plt.show()
# Per-sentiment corpora and their word clouds.
excellent = " ".join(review for review in df[df["Sentiment"]=="excellent"].reviewtext)
good = " ".join(review for review in df[df["Sentiment"]=="good"].reviewtext)
bad = " ".join(review for review in df[df["Sentiment"]=="bad"].reviewtext)
wordcloud_excellent = WordCloud(background_color="white").generate(excellent)
# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud_excellent, interpolation='bilinear')
plt.title("Most Frequent Words in the Excellent Review Text")
plt.axis("off")
plt.show()
wordcloud_good = WordCloud( background_color="white").generate(good)
# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud_good, interpolation='bilinear')
plt.title("Most Frequent Words in the Good Review Text")
plt.axis("off")
plt.show()
wordcloud_bad = WordCloud(background_color="white").generate(bad)
plt.imshow(wordcloud_bad, interpolation='bilinear')
plt.title ("Most Frequent Words in the Bad Review Text")
plt.axis("off")
plt.show()
# extracting the stopwords from nltk library
sw = stopwords.words('english')
# displaying the stopwords
np.array(sw)
print("Number of stopwords: ", len(sw))
def stopwords(text):
    """Lowercase *text* and drop every token found in the global stop list ``sw``."""
    # NOTE(review): this def shadows the imported nltk.corpus.stopwords name;
    # callers apply it by this name, so the (unfortunate) name is kept.
    kept = (tok.lower() for tok in text.split() if tok.lower() not in sw)
    return " ".join(kept)
df['reviewtext'] = df['reviewtext'].apply(stopwords)
text_afterstop = " ".join(review for review in df.reviewtext)
print ("There are {} words in the combination of all review after stop words removal.".format(len(text_afterstop)))
# Extend the stop list with domain-specific words and strip those too.
# NOTE(review): rebinding 'stopwords' to a list shadows the function above.
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['hotel','one','us','stay','day','us','night','also','room', 'rooms']
stopwords.extend(newStopWords)
print("Number of new stopwords: ", len(stopwords))
df['reviewtext'] = df['reviewtext'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
text_afternewstop = " ".join(review for review in df.reviewtext)
print ("There are {} words in the combination of all review after new stop words removal.".format(len(text_afternewstop)))
# Ten most frequent remaining tokens.
freq = pd.Series(' '.join(df['reviewtext']).split()).value_counts()[:10]
freq
from nltk.tokenize import RegexpTokenizer
# Tokenize the cleaned text (word characters only).
tokenizer = RegexpTokenizer(r'\w+')
df["tokens"] = df["reviewtext"].apply(tokenizer.tokenize)
df.head(1)
text_final = " ".join(review for review in df.reviewtext)
print ("There are {} words in the combination of all review.".format(len(text_final)))
wordcloud = WordCloud(background_color="white").generate(text_final)
# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
#plt.title("Most Frequent Words in all the Review Text after Text Clean up")
plt.axis("off")
plt.show()
# Per-sentiment corpora and word clouds after the full cleanup.
excellent_final = " ".join(review for review in df[df["Sentiment"]=="excellent"].reviewtext)
good_final = " ".join(review for review in df[df["Sentiment"]=="good"].reviewtext)
bad_final = " ".join(review for review in df[df["Sentiment"]=="bad"].reviewtext)
# BUG FIX: the original used `plt.title = "..."`, which REBINDS the pyplot
# title function to a string (so no title is drawn and any later call to
# plt.title(...) raises TypeError). It must be called, not assigned.
wordcloud_excellentfinal = WordCloud( background_color="white").generate(excellent_final)
plt.imshow(wordcloud_excellentfinal, interpolation='bilinear')
plt.title("Most Frequent Words in the Excellent Review Text After Cleanup")
plt.axis("off")
plt.show()
wordcloud_good_final = WordCloud( background_color="white").generate(good_final)
plt.imshow(wordcloud_good_final, interpolation='bilinear')
plt.title("Most Frequent Words in the Good Review Text After Cleanup")
plt.axis("off")
plt.show()
wordcloud_bad_final = WordCloud( background_color="white").generate(bad_final)
plt.imshow(wordcloud_bad_final, interpolation='bilinear')
plt.title("Most Frequent Words in the Bad Review Text After Cleanup")
plt.axis("off")
plt.show()
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# Corpus statistics: total tokens, vocabulary size, sentence-length histogram.
all_words = [word for tokens in df["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in df["tokens"]]
VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 10))
plt.xlabel('Sentence length')
plt.ylabel('Number of sentences')
plt.hist(sentence_lengths)
plt.show()
# Word count per review.
df['num_words'] = df.reviewtext.apply(lambda x : len(x.split()))
df['num_words'].describe()
# Bucket reviews by word count and plot the distribution per bucket.
# (The dead `bins=[0,50,75,inf]` assignment was dropped: pd.cut passes its own
# edges below, and the name is rebound before its next use.)
# BUG FIX: the third label read '200-000'; the 200-300 bucket is meant.
df['bins'] = pd.cut(df.num_words, bins=[0, 100, 200, 300, 400, 500, np.inf],
                    labels=['0-100', '100-200', '200-300', '300-400', '400-500', '>500'])
df.sample(1)
word_distribution = df.groupby('bins').size().reset_index().rename(columns={0: 'counts'})
word_distribution.head(10)
sns.barplot(x='bins', y='counts', data=word_distribution).set_title("Word distribution per bin")
def length(text):
    """Return the number of characters in *text* (thin wrapper over len)."""
    return len(text)
# Character-length feature plus per-sentiment length histograms.
df['length'] = df['reviewtext'].apply(len)
df.head(1)
df['length'].describe()
bad_sentiment = df[df['Sentiment'] == 'bad']
excellent_sentiment = df[df['Sentiment'] == 'excellent']
good_sentiment = df[df['Sentiment'] == 'good']
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
bins = 500
plt.hist(good_sentiment['length'], alpha = 1.0, bins=bins, label='GOOD')
plt.hist(excellent_sentiment['length'], alpha = 0.9, bins=bins, label='EXCELLENT')
plt.hist(bad_sentiment['length'],alpha = 0.8 , bins=bins, label='BAD')
plt.xlabel('length')
plt.ylabel('numbers')
plt.legend(loc='upper right')
plt.xlim(0,500)
plt.grid()
plt.show()
freq = pd.Series(' '.join(df['reviewtext']).split()).value_counts()[:10]
freq
# Word and character counts as additional features.
df['word_count'] = df['reviewtext'].apply(lambda x: len(str(x).split(" ")))
df[['reviewtext','word_count']].head(1)
df['char_count'] = df['reviewtext'].str.len() ## this also includes spac
df[['reviewtext','char_count']].head(1)
# Encode the target as integers: bad=0, good=1, excellent=2.
df['label'] = df['Sentiment'].map({'bad': 0, 'good': 1, 'excellent':2})
# Pre-Processing
# VADER sentiment scores added as extra features.
SIA = SentimentIntensityAnalyzer()
df["reviewtext"]= df["reviewtext"].astype(str)
# Applying Model, Variable Creation
df['Polarity Score']=df["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['compound'])
df['Neutral Score']=df["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neu'])
df['Negative Score']=df["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neg'])
df['Positive Score']=df["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['pos'])
# Converting 0 to 1 Decimal Score to a Categorical Variable
df['Rating']=''
df.loc[df['Polarity Score']>0,'Rating']='Positive'
df.loc[df['Polarity Score']==0,'Rating']='Neutral'
df.loc[df['Polarity Score']<0,'Rating']='Negative'
df.head(1)
def percentstandardize_barplot(x, y, hue, data, ax=None, order=None):
    """Draw a seaborn barplot of *hue* shares (as percentages) within each *x* group.

    Arguments mirror seaborn.barplot; *data* must contain columns *x* and
    *hue*. The *y* argument names the computed percentage column (callers
    pass "Percentage").
    """
    shares = (data[[x, hue]]
              .reset_index(drop=True)
              .groupby([x])[hue]
              .value_counts(normalize=True)
              .rename('Percentage')
              .mul(100)
              .reset_index()
              .sort_values(hue))
    sns.barplot(x=x, y=y, hue=hue, ax=ax, order=order, data=shares)
    plt.ylabel("Percentage %")
import string
# Side-by-side: raw counts and percentage-normalized VADER Rating vs Sentiment.
huevar = "Sentiment"
xvar = "Rating"
f, axes = plt.subplots(1,2,figsize=(12,5))
sns.countplot(x=xvar, hue=huevar,data=df, ax=axes[0], order=["Negative","Neutral","Positive"])
axes[0].set_title("Occurence of {}\nby {}".format(xvar, huevar))
axes[0].set_ylabel("Count")
percentstandardize_barplot(x=xvar,y="Percentage", hue=huevar,data=df, ax=axes[1])
axes[1].set_title("Percentage Normalized Occurence of {}\nby {}".format(xvar, huevar))
axes[1].set_ylabel("% Percentage by {}".format(huevar))
plt.show()
# 2x2 dashboard of Rating and Sentiment distributions.
f, axes = plt.subplots(2,2, figsize=[12,12])
sns.countplot(x="Rating", data=df, ax=axes[0,0], order=["Negative","Neutral","Positive"])
axes[0,0].set_xlabel("Rating")
axes[0,0].set_ylabel("Count")
axes[0,0].set_title("Overall Rating Occurrence")
sns.countplot(x="Sentiment", data=df, ax=axes[0,1])
axes[0,1].set_xlabel("Sentiment")
axes[0,1].set_ylabel("")
axes[0,1].set_title("Overall Sentiment Occurrence")
percentstandardize_barplot(x="Sentiment",y="Percentage",hue="Rating",data=df, ax=axes[1,0])
axes[1,0].set_xlabel("Sentiment")
axes[1,0].set_ylabel("Percentage %")
axes[1,0].set_title("Standardized Percentage Sentiment Frequency\nby Rating")
percentstandardize_barplot(x="Rating",y="Percentage",hue="Sentiment",data=df, ax=axes[1,1])
axes[1,1].set_ylabel("Occurrence Frequency")
axes[1,1].set_title("Standardized Percentage Rating Frequency\nby Sentiment")
axes[1,1].set_xlabel("Rating")
axes[1,1].set_ylabel("")
f.suptitle("Distribution of Rating Score and Sentiment for Hotel Reviews", fontsize=16)
f.tight_layout()
f.subplots_adjust(top=0.92)
plt.show()
from nltk.tokenize import RegexpTokenizer
# Re-tokenize after the extra stop-word pass.
tokenizer = RegexpTokenizer(r'\w+')
df["tokens_after"] = df["reviewtext"].apply(tokenizer.tokenize)
df.head(1)
from nltk.stem import PorterStemmer
# Stemmed and lemmatized variants kept as separate columns for comparison.
st = PorterStemmer()
df['stemmedtext']= df['reviewtext'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df[['reviewtext','stemmedtext']].sample(1)
from textblob import Word
df['lemmatizedtext'] = df['reviewtext'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df[['reviewtext','lemmatizedtext']].sample(1)
df[['reviewtext','stemmedtext','lemmatizedtext']].sample(1)
#from textblob import Word
#df['reviewtext'] = df['reviewtext'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
#df[['reviewtext','lemmatizedtext']].head(3)
from nltk.stem import PorterStemmer
# NOTE(review): this overwrites 'reviewtext' with its stemmed form; the
# TF-IDF models below therefore train on stemmed text.
st = PorterStemmer()
df['reviewtext']= df['reviewtext'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df.to_csv("Hotels_PreProcessed_stemmed.csv")
df.head(1)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# NOTE(review): this 'cv' variable is later shadowed by the cv() helper function.
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['stemmedtext'])
from sklearn.model_selection import train_test_split
# 70/30 split on the bag-of-words counts.
X_train, X_test, y_train, y_test = train_test_split(text_counts, df['Sentiment'], test_size=0.3, random_state=1)
from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Count Vectorizer Accuracy:",metrics.accuracy_score(y_test, predicted))
from sklearn.feature_extraction.text import TfidfVectorizer
# Same model on TF-IDF features (note: 'reviewtext' is stemmed by this point).
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['reviewtext'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
text_tf, df['Sentiment'], test_size=0.3, random_state=123)
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Tf-IDF Vectorizer Accuracy:",metrics.accuracy_score(y_test, predicted))
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
# Grid search over vectorizer / TF-IDF / NB hyper-parameters with 10-fold CV.
text_clf = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
tuned_parameters = {
'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
'tfidf__use_idf': (True, False),
'tfidf__norm': ('l1', 'l2'),
'clf__alpha': [1, 1e-1, 1e-2]
}
x_train, x_test, y_train, y_test = train_test_split(df['reviewtext'], df['Sentiment'], test_size=0.33, random_state=42)
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
clf.fit(x_train, y_train)
print(classification_report(y_test, clf.predict(x_test), digits=4))
print(accuracy_score(y_test, clf.predict(x_test)))
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['stemmedtext'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_counts, df['Sentiment'], test_size=0.3, random_state=1)
from sklearn.linear_model import LogisticRegression
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Logistic regression on the bag-of-words counts.
clf = LogisticRegression().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Logistic Regression Count Vectorizer Accuracy:",metrics.accuracy_score(y_test, predicted))
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(df['reviewtext'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
text_tf, df['Sentiment'], test_size=0.3, random_state=123)
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Logistic regression on the TF-IDF features.
clf = LogisticRegression().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("Logistic Regression Tf-IDF Vectorizer Accuracy:",metrics.accuracy_score(y_test, predicted))
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Grid search over the regularization strength C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
predicted= grid.predict(X_test)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Refit logistic regression with the best C found by the grid search above.
# BUG FIX: the original passed solver='warn' and multi_class='warn' -- these
# were transitional sentinel values printed by sklearn 0.20's repr and are
# rejected by modern scikit-learn; all other arguments were the defaults.
clf = LogisticRegression(C=1).fit(X_train, y_train)
predicted = clf.predict(X_test)
print("Logistic Regression Grid Search Accuracy:", metrics.accuracy_score(y_test, predicted))
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def cv(data):
    """Fit a bag-of-words CountVectorizer on *data*.

    Returns the document-term matrix and the fitted vectorizer so the
    caller can transform a held-out set with the same vocabulary.
    """
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(data)
    return matrix, vectorizer
# Hold-out split on the raw text lists, then bag-of-words + logistic regression.
list_corpus = df["reviewtext"].tolist()
list_labels = df["Sentiment"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2,
random_state=40)
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_counts, y_train)
y_predicted_logistic = clf.predict(X_test_counts)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
def get_metrics(y_test, y_predicted):
    """
    Score predictions against ground truth.

    Parameters:
        y_test : true labels
        y_predicted : predicted labels
    Output:
        (accuracy, precision, recall, f1) — precision/recall/f1 are
        weighted averages, so class imbalance across the sentiment
        labels is accounted for.
    """
    # Fraction of exactly-correct predictions over all samples.
    accuracy = accuracy_score(y_test, y_predicted)
    # Weighted precision: TP / (TP + FP), averaged by class support.
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                average='weighted')
    # Weighted recall: TP / (TP + FN), averaged by class support.
    recall = recall_score(y_test, y_predicted, pos_label=None,
                          average='weighted')
    # Weighted harmonic mean of precision and recall.
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_logistic)
print("Logistic Regression accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Logistic Regression on count vectors (repeats the fit above verbatim).
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_counts, y_train)
y_predicted_logistic = clf.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_logistic)
print("Logistic Regression accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Multinomial Naive Bayes on count vectors.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)
y_predicted_NB = clf.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_NB)
print("Multinominal NB accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Linear model trained with stochastic gradient descent (hinge loss by default).
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train_counts,y_train)
clf.score(X_test_counts,y_test)
y_pred_SGD = clf.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_SGD)
print("SGD Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Random forest with 501 entropy-split trees.
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators = 501,
criterion = 'entropy')
model_rf.fit(X_train_counts, y_train)
y_pred_rf = model_rf.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_rf)
print("Random Forest Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Bagging / boosting ensembles, all with library defaults.
from sklearn.ensemble import BaggingClassifier
model_bag = BaggingClassifier()
model_bag.fit(X_train_counts, y_train)
y_pred_bag = model_bag.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_bag)
print("Bagging Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.ensemble import GradientBoostingClassifier
model_GB = GradientBoostingClassifier()
model_GB.fit(X_train_counts, y_train)
y_pred_GB = model_GB.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_GB)
print("Gradient Boosting Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.ensemble import AdaBoostClassifier
model_ada = AdaBoostClassifier()
model_ada.fit(X_train_counts, y_train)
y_pred_ada = model_ada.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_ada)
print("Ada Boost Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train_counts, y_train)
y_pred_xgb = model_xgb.predict(X_test_counts)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_xgb)
print("XGBoost Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
def tfidf(data):
    """
    Fit a default word-level TF-IDF vectorizer on the documents.

    Parameters:
        data : iterable of raw text documents
    Output:
        (sparse TF-IDF matrix, the fitted vectorizer)
    """
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer
# Fit on train, transform test with the same vocabulary.
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
# Same model zoo as above, re-run on word-level TF-IDF features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
tfidf_lr = LogisticRegression()
tfidf_lr.fit(X_train_tfidf, y_train)
y_predicted_logistic = tfidf_lr.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_logistic)
print("TF-IDF Logistic Regression accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)
y_predicted_NB = clf.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_NB)
print("Multinominal NB accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train_tfidf,y_train)
clf.score(X_test_tfidf,y_test)
y_pred_SGD = clf.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_SGD)
print("SGD Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
model_rf = RandomForestClassifier(n_estimators = 501,
criterion = 'entropy')
model_rf.fit(X_train_tfidf, y_train)
y_pred_rf = model_rf.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_rf)
print("Random Forest Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
model_bag = BaggingClassifier()
model_bag.fit(X_train_tfidf, y_train)
y_pred_bag = model_bag.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_bag)
print("Bagging Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.ensemble import GradientBoostingClassifier
model_GB = GradientBoostingClassifier()
model_GB.fit(X_train_tfidf, y_train)
y_pred_GB = model_GB.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_GB)
print("Gradient Boosting Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.ensemble import AdaBoostClassifier
model_ada = AdaBoostClassifier()
model_ada.fit(X_train_tfidf, y_train)
y_pred_ada = model_ada.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_ada)
print("Ada Boost Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
model_xgb.fit(X_train_tfidf, y_train)
y_pred_xgb = model_xgb.predict(X_test_tfidf)
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_xgb)
print("XGBoost Classifier accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
# Bulk imports for the feature-comparison experiments below.
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
# Rebuild the 80/20 split and count features with the same random_state.
list_corpus = df["reviewtext"].tolist()
list_labels = df["Sentiment"].tolist()
X_train, X_test, y_train, y_test = train_test_split(list_corpus, list_labels, test_size=0.2,
random_state=40)
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)
# NOTE: this redefinition shadows the earlier tfidf() — the vocabulary is
# now capped at the 5,000 highest-tf-idf word features.
def tfidf(data):
    """
    Fit a word-level TF-IDF vectorizer limited to 5,000 features.

    Parameters:
        data : iterable of raw text documents
    Output:
        (sparse TF-IDF matrix, the fitted vectorizer)
    """
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
def tfidf_ngram(data):
    """
    Fit a word-level bigram/trigram TF-IDF vectorizer (max 5,000 features).

    Parameters:
        data : iterable of raw text documents
    Output:
        (sparse TF-IDF matrix, the fitted vectorizer)
    """
    tfidf_vectorizer_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    train = tfidf_vectorizer_ngram.fit_transform(data)
    return train, tfidf_vectorizer_ngram
# BUG FIX: the original called tfidf(X_train) here, so the "ngram" features
# were actually plain word-level unigram TF-IDF and all downstream "N-Gram"
# results were mislabelled.
X_train_tfidf_ngram, tfidf_vectorizer_ngram = tfidf_ngram(X_train)
X_test_tfidf_ngram = tfidf_vectorizer_ngram.transform(X_test)
def tfidf_ngram_character(data):
    """
    Fit a character-level 2-3 gram TF-IDF vectorizer (max 5,000 features).

    Parameters:
        data : iterable of raw text documents
    Output:
        (sparse TF-IDF matrix, the fitted vectorizer)
    """
    tfidf_vectorizer_ngram_character = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    train = tfidf_vectorizer_ngram_character.fit_transform(data)
    return train, tfidf_vectorizer_ngram_character
# BUG FIX: same issue — the original called tfidf(X_train), not the
# character-level vectorizer defined directly above.
X_train_tfidf_ngram_char, tfidf_vectorizer_ngram_character = tfidf_ngram_character(X_train)
X_test_tfidf_ngram_char = tfidf_vectorizer_ngram_character.transform(X_test)
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False):
    """
    Fit `classifier` on the training features/labels and return its
    accuracy on the test features.

    Parameters:
        classifier : any estimator with fit()/predict()
        feature_vector_train : training feature matrix
        label : training labels
        feature_vector_test : test feature matrix
        is_neural_net : if True, predictions are one-hot/probability rows
                        and are collapsed with argmax

    NOTE(review): the ground-truth test labels come from the module-level
    global y_test, not a parameter — this only works while that global
    matches feature_vector_test. accuracy_score is called as
    (y_pred, y_true); the argument order is swapped relative to the API,
    but accuracy is symmetric so the value is unaffected.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    if is_neural_net:
        predictions = predictions.argmax(axis = -1)
    return metrics.accuracy_score(predictions, y_test)
# Sweep: NB / Logistic / SGD("SVM") / RandomForest / XGB, each evaluated on
# count, word TF-IDF, word n-gram TF-IDF and char n-gram TF-IDF features.
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_counts, y_train, X_test_counts)
print ("NB, Count Vectors: ", accuracy)
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)
# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("NB, CharLevel Vectors: ", accuracy)
accuracy = train_model(linear_model.LogisticRegression(), X_train_counts, y_train, X_test_counts)
print ("Logistic, Count Vectors: ", accuracy)
accuracy = train_model(linear_model.LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf)
print ("Logistice, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(linear_model.LogisticRegression(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("Logistic, N-Gram Vectors: ", accuracy)
accuracy = train_model(linear_model.LogisticRegression(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("Logistice, CharLevel Vectors: ", accuracy)
# SGDClassifier with default hinge loss is a linear SVM, hence "SVM" labels.
accuracy = train_model(SGDClassifier(), X_train_counts, y_train, X_test_counts)
print ("SVM, Count Vectors: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("SVM, CharLevel Vectors: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_counts, y_train, X_test_counts)
print ("RFC, Count Vectors: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("RFC, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("RFC, N-Gram Vectors: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("RFC, CharLevel Vectors: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_counts, y_train, X_test_counts)
print ("XGB, Count Vectors: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("XGB, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("XGB, N-Gram Vectors: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("XGB, CharLevel Vectors: ", accuracy)
# Recompute all four feature sets, this time using the dedicated n-gram and
# character-level vectorizer functions.
X_train_counts, count_vectorizer = cv(X_train)
X_test_counts = count_vectorizer.transform(X_test)
X_train_tfidf, tfidf_vectorizer = tfidf(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_train_tfidf_ngram, tfidf_vectorizer_ngram = tfidf_ngram(X_train)
X_test_tfidf_ngram = tfidf_vectorizer_ngram.transform(X_test)
X_train_tfidf_ngram_char, tfidf_vectorizer_ngram_character = tfidf_ngram_character(X_train)
X_test_tfidf_ngram_char = tfidf_vectorizer_ngram_character.transform(X_test)
### Naive Bayes
accuracy = train_model(MultinomialNB(), X_train_counts, y_train, X_test_counts)
print ("NB, Count Vectors: ", accuracy)
accuracy = train_model(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(MultinomialNB(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)
accuracy = train_model(MultinomialNB(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("NB, CharLevel Vectors: ", accuracy)
### Logistic
accuracy = train_model(LogisticRegression(), X_train_counts, y_train, X_test_counts)
print ("Logistic, Count Vectors: ", accuracy)
accuracy = train_model(LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf)
print ("Logistic, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(LogisticRegression(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("Logistic, N-Gram Vectors: ", accuracy)
accuracy = train_model(LogisticRegression(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("Logistic, CharLevel Vectors: ", accuracy)
### Support Vector Machine
accuracy = train_model(SGDClassifier(), X_train_counts, y_train, X_test_counts)
print ("SVM, Count Vectors: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)
accuracy = train_model(SGDClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("SVM, CharLevel Vectors: ", accuracy)
### Decision Tree Classifier
accuracy = train_model(DecisionTreeClassifier(), X_train_counts, y_train, X_test_counts)
print ("Decision Tree, Count Vectors: ", accuracy)
accuracy = train_model(DecisionTreeClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("Decision Tree, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(DecisionTreeClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("Decision Tree, N-Gram Vectors: ", accuracy)
accuracy = train_model(DecisionTreeClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("Decision Tree, CharLevel Vectors: ", accuracy)
### Random Forest Classifier
accuracy = train_model(RandomForestClassifier(), X_train_counts, y_train, X_test_counts)
print ("RFC, Count Vectors: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("RFC, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("RFC, N-Gram Vectors: ", accuracy)
accuracy = train_model(RandomForestClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("RFC, CharLevel Vectors: ", accuracy)
### Bagging Classifier
accuracy = train_model(BaggingClassifier(), X_train_counts, y_train, X_test_counts)
print ("Bagging, Count Vectors: ", accuracy)
accuracy = train_model(BaggingClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("Bagging, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(BaggingClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("Bagging, N-Gram Vectors: ", accuracy)
accuracy = train_model(BaggingClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("Bagging, CharLevel Vectors: ", accuracy)
### Ada Boost classifier
accuracy = train_model(AdaBoostClassifier(), X_train_counts, y_train, X_test_counts)
print ("AdaBoost, Count Vectors: ", accuracy)
accuracy = train_model(AdaBoostClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("AdaBoost, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(AdaBoostClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("AdaBoost, N-Gram Vectors: ", accuracy)
accuracy = train_model(AdaBoostClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("AdaBoost, CharLevel Vectors: ", accuracy)
### Gradient Boosting Classifier
accuracy = train_model(GradientBoostingClassifier(), X_train_counts, y_train, X_test_counts)
print ("Gradient Boosting, Count Vectors: ", accuracy)
accuracy = train_model(GradientBoostingClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("Gradient Boosting, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("Gradient Boosting, N-Gram Vectors: ", accuracy)
accuracy = train_model(GradientBoostingClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("Gradient Boosting, CharLevel Vectors: ", accuracy)
### Boosting Classifier (XGB)
accuracy = train_model(XGBClassifier(), X_train_counts, y_train, X_test_counts)
print ("XGB, Count Vectors: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf, y_train, X_test_tfidf)
print ("XGB, WordLevel TF-IDF: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf_ngram, y_train, X_test_tfidf_ngram)
print ("XGB, N-Gram Vectors: ", accuracy)
accuracy = train_model(XGBClassifier(), X_train_tfidf_ngram_char, y_train, X_test_tfidf_ngram_char)
print ("XGB, CharLevel Vectors: ", accuracy)
#sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve
from sklearn import ensemble, linear_model, neighbors, svm, tree, neural_network
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm,model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
#load package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from math import sqrt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Candidate classifiers for the side-by-side comparison table.
Model = [
#Ensemble Methods
ensemble.AdaBoostClassifier(),
ensemble.BaggingClassifier(),
ensemble.GradientBoostingClassifier(),
ensemble.RandomForestClassifier(),
#GLM
linear_model.LogisticRegression(),
#Navies Bayes
naive_bayes.MultinomialNB(),
#Nearest Neighbor
neighbors.KNeighborsClassifier(),
#SVM
svm.LinearSVC(),
#Trees
tree.DecisionTreeClassifier(),
xgboost.XGBClassifier()
]
# Fit every model on count-vector features and collect train/test accuracy,
# precision and recall into one comparison DataFrame.
# (Column labels 'Precission'/'Vectore' are misspelled but kept unchanged
# because they are runtime strings referenced by the output table.)
model_columns = []
model_compare = pd.DataFrame(columns = model_columns)
row_index = 0
for alg in Model:
    predicted = alg.fit(X_train_counts, y_train).predict(X_test_counts)
    #fp, tp, th = roc_curve(y_test, predicted)
    model_name = alg.__class__.__name__
    model_compare.loc[row_index,'Count Vector Model Name'] = model_name
    model_compare.loc[row_index, 'Count Vector Model Train Accuracy'] = round(alg.score(X_train_counts, y_train), 4)
    model_compare.loc[row_index, 'Count Vector Model Test Accuracy'] = round(alg.score(X_test_counts, y_test), 4)
    model_compare.loc[row_index, 'Count Vector Model Precission'] = precision_score(y_test, predicted, average='micro')
    model_compare.loc[row_index, 'Count Vectore Model Recall'] = recall_score(y_test, predicted, average='micro')
    row_index+=1
# Best test accuracy first, then plot the ranking.
model_compare.sort_values(by = ['Count Vector Model Test Accuracy'], ascending = False, inplace = True)
model_compare
plt.subplots(figsize=(16,6))
sns.barplot(x="Count Vector Model Name", y="Count Vector Model Test Accuracy",data=model_compare,palette='hot',edgecolor=sns.color_palette('dark',7))
plt.xticks(rotation=90)
#plt.title('Count Vector Models Test Accuracy Comparison')
plt.show()
# Same comparison loop, re-run on the TF-IDF features.
model_columns = []
model_compare = pd.DataFrame(columns = model_columns)
row_index = 0
for alg in Model:
    predicted = alg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    #fp, tp, th = roc_curve(y_test, predicted)
    model_name = alg.__class__.__name__
    model_compare.loc[row_index,'TF-IDF Vector Model Name'] = model_name
    model_compare.loc[row_index, 'TF-IDF Vector Model Train Accuracy'] = round(alg.score(X_train_tfidf, y_train), 4)
    model_compare.loc[row_index, 'TF-IDF Vector Model Test Accuracy'] = round(alg.score(X_test_tfidf, y_test), 4)
    model_compare.loc[row_index, 'TF-IDF Vector Model Precission'] = precision_score(y_test, predicted, average='micro')
    model_compare.loc[row_index, 'TF-IDF Vectore Model Recall'] = recall_score(y_test, predicted, average='micro')
    row_index+=1
model_compare.sort_values(by = ['TF-IDF Vector Model Test Accuracy'], ascending = False, inplace = True)
model_compare
plt.subplots(figsize=(16,6))
sns.barplot(x="TF-IDF Vector Model Name", y="TF-IDF Vector Model Test Accuracy",data=model_compare,palette='cool',edgecolor=sns.color_palette('dark',7))
plt.xticks(rotation=90)
#plt.title('TF-IDF Vector Models Test Accuracy Comparison')
plt.show()
# Basic packages
import pandas as pd
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path
# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
# NOTE(review): keras.utils.np_utils was removed in newer Keras
# (to_categorical now lives in keras.utils) — confirm the pinned version.
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
NB_WORDS = 10000 # Parameter indicating the number of words we'll put in the dictionary
NB_START_EPOCHS = 20 # Number of epochs we usually start to train with
BATCH_SIZE = 512 # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 20 # Maximum number of words in a sequence
def deep_model(model, X_train, y_train, X_valid, y_valid):
    '''
    Compile and train a Keras model, validating on a held-out set
    every epoch.

    Parameters:
        model : model with the chosen architecture
        X_train : training features
        y_train : training target
        X_valid : validation features
        y_valid : validation target
    Output:
        model training history
    '''
    # rmsprop + categorical cross-entropy: multi-class setup matching the
    # 3-way softmax output and one-hot targets.
    model.compile(optimizer='rmsprop'
                  , loss='categorical_crossentropy'
                  , metrics=['accuracy'])
    history = model.fit(X_train
                        , y_train
                        , epochs=NB_START_EPOCHS
                        , batch_size=BATCH_SIZE
                        , validation_data=(X_valid, y_valid)
                        , verbose=0)
    return history
def eval_metric(model, history, metric_name):
    '''
    Plot a trained model's training vs. validation curve for one metric,
    one point per epoch.

    Parameters:
        model : trained model (unused except in the commented-out title)
        history : model training history
        metric_name : loss or accuracy
    Output:
        line chart with epochs on x-axis and metric on y-axis
    '''
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]
    # Epochs are 1-based on the x-axis.
    e = range(1, NB_START_EPOCHS + 1)
    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.xlabel('Epoch number')
    plt.ylabel(metric_name)
    #plt.title('Comparing training and validation ' + metric_name + ' for ' + model.name)
    plt.legend()
    plt.show()
def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    '''
    Function to test the model on new data after training it
    on the full training data with the optimal number of epochs.

    Parameters:
        model : trained model
        X_train : training features
        y_train : training target
        X_test : test features
        y_test : test target
        epoch_stop : optimal number of epochs to train for
    Output:
        test loss and test accuracy as returned by model.evaluate
    '''
    model.fit(X_train
              , y_train
              , epochs=epoch_stop
              , batch_size=BATCH_SIZE
              , verbose=0)
    # results[0] is the loss, results[1] the accuracy.
    results = model.evaluate(X_test, y_test)
    print()
    print('Test accuracy: {0: .2f}%'.format(results[1]*100))
    return results
def remove_stopwords(input_text):
    '''
    Remove English stopwords from a single text string.

    Single-character tokens are also dropped. Negation words that carry
    sentiment ("n't", "not", "no") are kept via a whitelist even though
    NLTK counts them as stopwords.

    Parameters:
        input_text : text to clean
    Output:
        cleaned string
    '''
    # set() gives O(1) membership tests instead of scanning the ~180-entry
    # stopword list once per token (the original rebuilt and linearly
    # searched a list on every call).
    stopwords_set = set(stopwords.words('english'))
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    words = input_text.split()
    clean_words = [word for word in words
                   if (word not in stopwords_set or word in whitelist) and len(word) > 1]
    return " ".join(clean_words)
def remove_mentions(input_text):
    '''
    Delete @-mentions (an '@' followed by word characters) from a string.

    Parameters:
        input_text : text to clean
    Output:
        the text with every mention removed
    '''
    mention_pattern = r'@\w+'
    return re.sub(mention_pattern, '', input_text)
def compare_models_by_metric(model_1, model_2, model_hist_1, model_hist_2, metric):
    '''
    Plot one training metric of two models against epoch number so the
    curves can be compared directly.

    Parameters:
        model_1, model_2 : the two models (their .name labels the lines)
        model_hist_1 : training history of model 1
        model_hist_2 : training history of model 2
        metric : metric to compare: loss, acc, val_loss or val_acc
    Output:
        plot of the chosen metric for both models
    '''
    metric_model_1 = model_hist_1.history[metric]
    metric_model_2 = model_hist_2.history[metric]
    e = range(1, NB_START_EPOCHS + 1)
    # Map the raw history key to a human-readable axis label.
    metrics_dict = {
        'acc' : 'Training Accuracy',
        'loss' : 'Training Loss',
        'val_acc' : 'Validation accuracy',
        'val_loss' : 'Validation loss'
    }
    metric_label = metrics_dict[metric]
    plt.plot(e, metric_model_1, 'bo', label=model_1.name)
    plt.plot(e, metric_model_2, 'b', label=model_2.name)
    plt.xlabel('Epoch number')
    plt.ylabel(metric_label)
    # BUG FIX: the original did `plt.title = "..."`, which never set the
    # title and rebound pyplot's title function to a string, breaking any
    # later plt.title(...) call in the session.
    plt.title('Comparing ' + metric_label + ' between models')
    plt.legend()
    plt.show()
def optimal_epoch(model_hist):
    '''
    Find the (1-based) epoch at which validation loss bottomed out.

    Parameters:
        model_hist : training history of model
    Output:
        epoch number with minimum validation loss
    '''
    val_losses = model_hist.history['val_loss']
    # argmin is 0-based; epochs are reported 1-based.
    min_epoch = np.argmin(val_losses) + 1
    print("Minimum validation loss reached in epoch {}".format(min_epoch))
    return min_epoch
Here I shuffle the data before splitting it into train and test sets; this way the Sentiment classes are distributed evenly over the train and test splits. We keep the reviewtext column as input and the Sentiment column as output.
# Load, shuffle, and clean the reviews, then split 85/15.
df = pd.read_csv("Train-1554810061973.csv")
df = df.reindex(np.random.permutation(df.index))
df = df[['reviewtext', 'Sentiment']]
df.reviewtext = df.reviewtext.apply(remove_stopwords).apply(remove_mentions)
X_train, X_test, y_train, y_test = train_test_split(df.reviewtext, df.Sentiment, test_size=0.15, random_state=42)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]
# Tokenizer keeps the NB_WORDS most frequent lower-cased words.
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               char_level=False,
               split=' ')
tk.fit_on_texts(X_train)
print('Fitted tokenizer on {} documents'.format(tk.document_count))
print('{} words in dictionary'.format(tk.num_words))
print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))
# Multi-hot (binary bag-of-words) document matrices for the dense network.
X_train_oh = tk.texts_to_matrix(X_train, mode='binary')
X_test_oh = tk.texts_to_matrix(X_test, mode='binary')
# Encode string sentiment labels as integers, then one-hot vectors.
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)
# NOTE(review): y_train[1] is label-based Series indexing (original row 1),
# not "the first sample"; it raises KeyError if row 1 landed in the test
# split — confirm this was intended.
print('"{}" is converted into {}'.format(y_train[1], y_train_le[1]))
print('"{}" is converted into {}'.format(y_train_le[1], y_train_oh[1]))
# Carve a 10% validation set out of the training data.
X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_train_oh, y_train_oh, test_size=0.1, random_state=37)
assert X_valid.shape[0] == y_valid.shape[0]
assert X_train_rest.shape[0] == y_train_rest.shape[0]
print('Shape of train set:',X_train_rest.shape)
print('Shape of validation set:',X_valid.shape)
First layer : (10000 x 64) + 64 = 640064
Second layer : (64 x 64) + 64 = 4160
Last layer : (64 x 3) + 3 = 195
# Baseline: two 64-unit ReLU hidden layers and a 3-way softmax output
# (one unit per sentiment class), on NB_WORDS-dimensional multi-hot input.
base_model = models.Sequential()
base_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
base_model.add(layers.Dense(64, activation='relu'))
base_model.add(layers.Dense(3, activation='softmax'))
# NOTE(review): assigning to .name fails on newer Keras where name is a
# read-only property — confirm the pinned Keras version.
base_model.name = 'Baseline model'
base_model.summary()
Fitting the model on the training data and validating it on the validation set.
The number of epochs is 20 (NB_START_EPOCHS), which is predetermined; we will observe at which epoch the model starts to overfit.
# Train the baseline, find its best epoch, and plot train/val loss.
base_history = deep_model(base_model, X_train_rest, y_train_rest, X_valid, y_valid)
base_min = optimal_epoch(base_history)
eval_metric(base_model, base_history, 'loss')
Reducing the model's capacity by removing one hidden layer and lowering the number of units in the remaining hidden layer to 16.
# Reduced-capacity variant: a single 16-unit hidden layer.
reduced_model = models.Sequential()
reduced_model.add(layers.Dense(16, activation='relu', input_shape=(NB_WORDS,)))
reduced_model.add(layers.Dense(3, activation='softmax'))
reduced_model.name = 'Reduced model'
reduced_model.summary()
reduced_history = deep_model(reduced_model, X_train_rest, y_train_rest, X_valid, y_valid)
reduced_min = optimal_epoch(reduced_history)
eval_metric(reduced_model, reduced_history, 'loss')
The reduced model trains for more epochs before it starts overfitting (the base model overfit at epoch 2, while the reduced model overfits at epoch 7). Its validation loss also rises more slowly than the base model's.
compare_models_by_metric(base_model, reduced_model, base_history, reduced_history, 'val_loss')
Comparing validation loss against the baseline model, it is clear that the reduced model starts overfitting at a later epoch, and its validation loss stays lower for much longer than the baseline model's.
# Regularized variant: same architecture as the baseline with L2 weight
# penalties (0.001) on both hidden layers.
# NOTE(review): the model is named 'L1 & L2' but only regularizers.l2 is
# applied — the name overstates what is used.
reg_model = models.Sequential()
reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(NB_WORDS,)))
reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
reg_model.add(layers.Dense(3, activation='softmax'))
reg_model.name = 'L1 & L2 Regularization model'
reg_model.summary()
reg_history = deep_model(reg_model, X_train_rest, y_train_rest, X_valid, y_valid)
reg_min = optimal_epoch(reg_history)
eval_metric(reg_model, reg_history, 'loss')
compare_models_by_metric(base_model, reg_model, base_history, reg_history, 'val_loss')
For the regularized model we notice that it starts overfitting in the same epoch as the baseline model. However, the loss increases much slower afterwards.
# Dropout variant: baseline architecture with 50% dropout after each
# hidden layer.
drop_model = models.Sequential()
drop_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(64, activation='relu'))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(3, activation='softmax'))
drop_model.name = 'Dropout layers model'
drop_model.summary()
drop_history = deep_model(drop_model, X_train_rest, y_train_rest, X_valid, y_valid)
drop_min = optimal_epoch(drop_history)
eval_metric(drop_model, drop_history, 'loss')
compare_models_by_metric(base_model, drop_model, base_history, drop_history, 'val_loss')
The model with the Dropout layers starts overfitting later than the baseline model, but at the same epoch as the reduced model; its loss also remains much lower.
# Retrain each variant on the full training set up to its optimal epoch,
# then evaluate once on the held-out test set.
base_results = test_model(base_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh, base_min)
reduced_results = test_model(reduced_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh, reduced_min)
reg_results = test_model(reg_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh, reg_min)
drop_results = test_model(drop_model, X_train_oh, y_train_oh, X_test_oh, y_test_oh, drop_min)
HotelClassification/ Train.csv app.py templates/ home.html result.html static/ style.css
import os
# NOTE(review): non-raw Windows path — it works here only because \P is not
# a recognized escape sequence; r"D:\PhD" would be safer.
os.chdir("D:\PhD")
# Display screenshots of the deployed Flask app.
from IPython.display import Image
Image(filename='running_app.PNG', width = 600, height = 600)
Image(filename='input.PNG', width = 600, height = 600)
Image(filename='predict.PNG', width = 600, height = 600)
# Reviews for hotels already in the portfolio and for newly added hotels.
# Column 3 is the review date; parse it here so .dt accessors work later.
existing_hotels = pd.read_csv("ExistingHotels_CustomerVisitsdata-1554810038262.csv", parse_dates=[3])
new_hotels = pd.read_csv("NewHotels_CutstomerVisitsdata-1554810098964.csv", parse_dates=[3])
print ('Existing Hotels data has {0} rows and {1} columns'.format(existing_hotels.shape[0],existing_hotels.shape[1]))
# Fixed typo in the user-facing message: "Hostels" -> "Hotels".
print ('New Hotels data has {0} rows and {1} columns'.format(new_hotels.shape[0],new_hotels.shape[1]))
# Show long review texts; the option key must be namespaced as
# 'display.max_colwidth' (the bare 'max_colwidth' alias was removed in
# recent pandas).
pd.set_option('display.max_colwidth', 500)
# Strip the currency symbol and convert prices to integers.  regex=False is
# required: under the default regex engine '$' is an end-of-string anchor,
# so the dollar sign would never actually be removed.
existing_hotels['AveragePricing'] = existing_hotels['AveragePricing'].str.replace('$', '', regex=False).astype('int64')
new_hotels['AveragePricing'] = new_hotels['AveragePricing'].str.replace('$', '', regex=False).astype('int64')
# dtypes.reset_index() puts the column names first and their dtypes second;
# label them accordingly (the first column was mislabelled "Count").
dtype_df = existing_hotels.dtypes.reset_index()
dtype_df.columns = ["Column Name", "Column Type"]
dtype_df
# Number of columns per dtype.
dtype_df.groupby("Column Type").aggregate('count').reset_index()
# Unique-value count per column, ascending.
unique_counts = pd.DataFrame.from_records([(col, existing_hotels[col].nunique()) for col in existing_hotels.columns],
                                          columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])
unique_counts
dates = existing_hotels['Date']
# Expand the review date into calendar features.  `dt.week` / `dt.weekofyear`
# were removed in pandas 2.0; `dt.isocalendar().week` is the supported
# equivalent (same ISO week number).
iso_week = dates.dt.isocalendar().week.astype('int64')
date_expanded = pd.DataFrame({"year": dates.dt.year,
                              "month": dates.dt.month,
                              "dayofmonth": dates.dt.day,
                              "dayofyear": dates.dt.dayofyear,
                              "week": iso_week,
                              "weekofyear": iso_week,
                              "dayofweek": dates.dt.dayofweek,
                              "weekday": dates.dt.weekday,
                              "quarter": dates.dt.quarter,
                              })
date_expanded['quarter'] = date_expanded['quarter'].map({1: 'Quarter 1', 2: "Quarter 2", 3:'Quarter 3', 4: 'Quarter 4'})
date_expanded['month'] = date_expanded['month'].map({1: 'Jan', 2: "Feb", 3:'Mar', 4: 'Apr',5: 'May', 6: "Jun", 7:'Jul', 8: 'Aug',9: 'Sep', 10: "Oct", 11:'Nov', 12: 'Dec'})
# Fixed mapping: pandas dayofweek is 0=Monday ... 6=Sunday.  The original map
# skipped key 3 and used keys 4..7, leaving Thursdays unmapped (NaN) and
# shifting every later day by one.
date_expanded['dayofweek'] = date_expanded['dayofweek'].map({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})
date_expanded.head(3)
# Review volume per calendar bucket (bars ordered by count, not chronologically).
date_expanded['year'].value_counts().plot.bar(title="No of Reviews given in each Year")
date_expanded['quarter'].value_counts().plot.bar(title= "Overall reviews for each Quarter")
date_expanded['month'].value_counts().plot.bar(title= "Overall reviews in each Month")
# Ten most-reviewed hotels.
existing_hotels['Hotelid'].value_counts().head(10).plot.bar(title= "Top 10 Hotels which got more Reviews",figsize = (16,8))
# Append the expanded date features to the review table (column-wise join).
existing_hotels = pd.concat([existing_hotels, date_expanded], axis=1)
# Review counts per (overall rating, hotel) pair.
hotelid_avgrating = pd.crosstab(index=existing_hotels["AverageOverallRatingOfHotel"],
columns=[existing_hotels["Hotelid"]]) # Include row and column totals
hotelid_avgrating
# Drill into one hotel's rating distribution.
hotel_188 = hotelid_avgrating['hotel_188']
hotel_188
hotel_188.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Hotel_188 Overall Rating Distribution" )
plt.xlabel('AverageOverallRatingOfHotel', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
# Same cross-tabulation, additionally split by review year.
hotelid_year_sentiment = pd.crosstab(index=existing_hotels["AverageOverallRatingOfHotel"],
columns=[existing_hotels["Hotelid"],existing_hotels['year']]) # Include row and column totals
hotelid_year_sentiment
hotel_188_year = hotelid_year_sentiment['hotel_188']
hotel_188_year.plot(kind="bar",
figsize=(12,10),
stacked=False, title= "Hotel_188 Year Wise Average Rating Distribution" )
plt.xlabel('AverageOverallRatingOfHotel', fontsize=18)
plt.ylabel('No Of Reviews', fontsize=16)
existing_hotels.columns
# NOTE(review): duplicated inspection statement (identical to the line above).
existing_hotels.columns
# Fixed: the original referenced non-existent columns ('pric', 'price',
# 'points' — leftovers from a different dataset) and raised KeyError.  This
# table's price and rating columns are 'AveragePricing' and
# 'AverageOverallRatingOfHotel'.
existing_hotels[existing_hotels['AveragePricing'] < 100].sample(100).plot.scatter(x='AveragePricing', y='AverageOverallRatingOfHotel')
# Keep only the id, review text, readership and per-aspect rating columns.
existing_hotels = existing_hotels[["Hotelid","reviewtext", "NoOfReaders", "HelpfulToNoOfreaders", "Value_rating", "Rooms_rating", "Location_rating", "Cleanliness_rating", "Checkin_rating", "Service_rating", "Businessservice_rating", "AverageOverallRatingOfHotel", "AveragePricing"]]
new_hotels = new_hotels[["Hotelid","reviewtext", "NoOfReaders", "HelpfulToNoOfreaders", "Value_rating", "Rooms_rating", "Location_rating", "Cleanliness_rating", "Checkin_rating", "Service_rating", "Businessservice_rating", "AverageOverallRatingOfHotel", "AveragePricing"]]
# standardize_text / remove_punctuation are helpers defined elsewhere in this
# notebook — presumably lower-casing and punctuation stripping; confirm there.
existing_hotels = standardize_text(existing_hotels, "reviewtext")
new_hotels = standardize_text(new_hotels, "reviewtext")
existing_hotels['reviewtext'] = existing_hotels['reviewtext'].apply(remove_punctuation)
new_hotels['reviewtext'] = new_hotels['reviewtext'].apply(remove_punctuation)
# English stopwords plus domain filler words that carry no sentiment.
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['hotel','one','us','stay','day','us','night','also','room', 'rooms']
stopwords.extend(newStopWords)
# Membership tests against a set are O(1); testing each token against the
# ~190-word list scanned it linearly for every word of every review.
stopword_set = set(stopwords)
existing_hotels['reviewtext'] = existing_hotels['reviewtext'].apply(lambda x: " ".join(w for w in x.split() if w not in stopword_set))
new_hotels['reviewtext'] = new_hotels['reviewtext'].apply(lambda x: " ".join(w for w in x.split() if w not in stopword_set))
# Tokenize the cleaned text into word lists (\w+ keeps alphanumerics only).
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
existing_hotels["tokens"] = existing_hotels["reviewtext"].apply(tokenizer.tokenize)
new_hotels["tokens"] = new_hotels["reviewtext"].apply(tokenizer.tokenize)
# Word count per review.
existing_hotels['num_words'] = existing_hotels.reviewtext.apply(lambda x : len(x.split()))
new_hotels['num_words'] = new_hotels.reviewtext.apply(lambda x : len(x.split()))
# Bucket reviews by word count.  The original first assigned
# bins=[0,50,75,inf] and then ignored it; define the real edges once and
# fix the '200-000' label typo -> '200-300'.
bins = [0, 100, 200, 300, 400, 500, np.inf]
labels = ['0-100', '100-200', '200-300', '300-400', '400-500', '>500']
existing_hotels['bins'] = pd.cut(existing_hotels.num_words, bins=bins, labels=labels)
new_hotels['bins'] = pd.cut(new_hotels.num_words, bins=bins, labels=labels)
word_distribution_existing = existing_hotels.groupby('bins').size().reset_index().rename(columns={0:'counts'})
word_distribution_existing.head(10)
word_distribution_new = new_hotels.groupby('bins').size().reset_index().rename(columns={0:'counts'})
word_distribution_new.head(10)
def length(text):
    """Return the number of characters in *text*."""
    return len(text)
# Character length of the cleaned review text.
existing_hotels['length'] = existing_hotels['reviewtext'].apply(len)
new_hotels['length'] = new_hotels['reviewtext'].apply(len)
# Token count by splitting on single spaces (counts empty tokens too, so it
# can differ slightly from num_words computed earlier with .split()).
existing_hotels['word_count'] = existing_hotels['reviewtext'].apply(lambda x: len(str(x).split(" ")))
new_hotels['word_count'] = new_hotels['reviewtext'].apply(lambda x: len(str(x).split(" ")))
existing_hotels['char_count'] = existing_hotels['reviewtext'].str.len() ## this also includes space
new_hotels['char_count'] = new_hotels['reviewtext'].str.len()
# Pre-Processing: VADER sentiment scores for existing hotels.
import nltk
# SentimentIntensityAnalyzer was used without being imported anywhere in the
# visible notebook, which raises NameError on a fresh kernel.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
SIA = SentimentIntensityAnalyzer()
existing_hotels["reviewtext"]= existing_hotels["reviewtext"].astype(str)
# Applying Model, Variable Creation: compound/neu/neg/pos score per review.
existing_hotels['Polarity Score']=existing_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['compound'])
existing_hotels['Neutral Score']=existing_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neu'])
existing_hotels['Negative Score']=existing_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neg'])
existing_hotels['Positive Score']=existing_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['pos'])
# Converting the compound score (range -1..1) to a categorical label.
existing_hotels['Rating']=''
existing_hotels.loc[existing_hotels['Polarity Score']>0,'Rating']='Positive'
existing_hotels.loc[existing_hotels['Polarity Score']==0,'Rating']='Neutral'
existing_hotels.loc[existing_hotels['Polarity Score']<0,'Rating']='Negative'
# Pre-Processing: same VADER scoring for the new hotels.
import nltk
# Explicit import fixes the NameError on a fresh kernel (see above cell).
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
SIA = SentimentIntensityAnalyzer()
new_hotels["reviewtext"]= new_hotels["reviewtext"].astype(str)
# Applying Model, Variable Creation
new_hotels['Polarity Score']=new_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['compound'])
new_hotels['Neutral Score']=new_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neu'])
new_hotels['Negative Score']=new_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['neg'])
new_hotels['Positive Score']=new_hotels["reviewtext"].apply(lambda x:SIA.polarity_scores(x)['pos'])
# Converting the compound score (range -1..1) to a categorical label.
new_hotels['Rating']=''
new_hotels.loc[new_hotels['Polarity Score']>0,'Rating']='Positive'
new_hotels.loc[new_hotels['Polarity Score']==0,'Rating']='Neutral'
new_hotels.loc[new_hotels['Polarity Score']<0,'Rating']='Negative'
# Porter-stem every token so inflected forms collapse to a common root.
from nltk.stem import PorterStemmer
st = PorterStemmer()
existing_hotels['reviewtext']= existing_hotels['reviewtext'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
new_hotels['reviewtext']= new_hotels['reviewtext'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
# Persist the fully pre-processed tables.
existing_hotels.to_csv("Existing_Hotels_Final.csv")
new_hotels.to_csv("New_Hotels_Final.csv")
print(existing_hotels.shape)
print(new_hotels.shape)
# Drop the text-derived columns; keep numeric features for per-hotel aggregation.
existing_hotels1 = existing_hotels.drop(['reviewtext','tokens','bins','length','word_count', 'char_count','Rating'],axis=1)
new_hotels1 = new_hotels.drop(['reviewtext','tokens','bins','length','word_count', 'char_count','Rating'],axis=1)
# Per-hotel mean of each rating/score column, one small frame per feature;
# these are merged into one table per hotel further below.
ratings_vr = existing_hotels1.groupby(['Hotelid'])[['Value_rating']].mean().rename(columns = {'Value_rating':'Mean_value_rating'}).reset_index()
ratings_nr = existing_hotels1.groupby(['Hotelid'])[['NoOfReaders']].mean().rename(columns = {'NoOfReaders':'Mean_No_readers'}).reset_index()
ratings_hnr = existing_hotels1.groupby(['Hotelid'])[['HelpfulToNoOfreaders']].mean().rename(columns = {'HelpfulToNoOfreaders':'Mean_H_No_readers'}).reset_index()
ratings_rr = existing_hotels1.groupby(['Hotelid'])[['Rooms_rating']].mean().rename(columns = {'Rooms_rating':'Mean_rooms_rating'}).reset_index()
ratings_lr = existing_hotels1.groupby(['Hotelid'])[['Location_rating']].mean().rename(columns = {'Location_rating':'Mean_location_rating'}).reset_index()
ratings_cr = existing_hotels1.groupby(['Hotelid'])[['Cleanliness_rating']].mean().rename(columns = {'Cleanliness_rating':'Mean_cleanliess_rating'}).reset_index()
ratings_chr = existing_hotels1.groupby(['Hotelid'])[['Checkin_rating']].mean().rename(columns = {'Checkin_rating':'Mean_checkin_rating'}).reset_index()
ratings_sr = existing_hotels1.groupby(['Hotelid'])[['Service_rating']].mean().rename(columns = {'Service_rating':'Mean_service_rating'}).reset_index()
ratings_bsr = existing_hotels1.groupby(['Hotelid'])[['Businessservice_rating']].mean().rename(columns = {'Businessservice_rating':'Mean_bus_service_rating'}).reset_index()
ratings_avgor = existing_hotels1.groupby(['Hotelid'])[['AverageOverallRatingOfHotel']].mean().rename(columns = {'AverageOverallRatingOfHotel':'Mean_avg_overall_rating'}).reset_index()
ratings_avgpr = existing_hotels1.groupby(['Hotelid'])[['AveragePricing']].mean().rename(columns = {'AveragePricing':'Mean_avg_pricing'}).reset_index()
ratings_nw = existing_hotels1.groupby(['Hotelid'])[['num_words']].mean().rename(columns = {'num_words':'Mean_no_words'}).reset_index()
ratings_ps = existing_hotels1.groupby(['Hotelid'])[['Polarity Score']].mean().rename(columns = {'Polarity Score':'Mean_pol_score'}).reset_index()
ratings_nus = existing_hotels1.groupby(['Hotelid'])[['Neutral Score']].mean().rename(columns = {'Neutral Score':'Mean_neu_score'}).reset_index()
ratings_neg = existing_hotels1.groupby(['Hotelid'])[['Negative Score']].mean().rename(columns = {'Negative Score':'Mean_neg_score'}).reset_index()
ratings_pos = existing_hotels1.groupby(['Hotelid'])[['Positive Score']].mean().rename(columns = {'Positive Score':'Mean_pos_score'}).reset_index()
# Same per-hotel mean aggregates for the new-hotels table.
ratings_vr_new = new_hotels1.groupby(['Hotelid'])[['Value_rating']].mean().rename(columns = {'Value_rating':'Mean_value_rating'}).reset_index()
ratings_nr_new = new_hotels1.groupby(['Hotelid'])[['NoOfReaders']].mean().rename(columns = {'NoOfReaders':'Mean_No_readers'}).reset_index()
ratings_hnr_new = new_hotels1.groupby(['Hotelid'])[['HelpfulToNoOfreaders']].mean().rename(columns = {'HelpfulToNoOfreaders':'Mean_H_No_readers'}).reset_index()
ratings_rr_new = new_hotels1.groupby(['Hotelid'])[['Rooms_rating']].mean().rename(columns = {'Rooms_rating':'Mean_rooms_rating'}).reset_index()
ratings_lr_new = new_hotels1.groupby(['Hotelid'])[['Location_rating']].mean().rename(columns = {'Location_rating':'Mean_location_rating'}).reset_index()
ratings_cr_new = new_hotels1.groupby(['Hotelid'])[['Cleanliness_rating']].mean().rename(columns = {'Cleanliness_rating':'Mean_cleanliess_rating'}).reset_index()
ratings_chr_new = new_hotels1.groupby(['Hotelid'])[['Checkin_rating']].mean().rename(columns = {'Checkin_rating':'Mean_checkin_rating'}).reset_index()
ratings_sr_new = new_hotels1.groupby(['Hotelid'])[['Service_rating']].mean().rename(columns = {'Service_rating':'Mean_service_rating'}).reset_index()
ratings_bsr_new = new_hotels1.groupby(['Hotelid'])[['Businessservice_rating']].mean().rename(columns = {'Businessservice_rating':'Mean_bus_service_rating'}).reset_index()
ratings_avgor_new = new_hotels1.groupby(['Hotelid'])[['AverageOverallRatingOfHotel']].mean().rename(columns = {'AverageOverallRatingOfHotel':'Mean_avg_overall_rating'}).reset_index()
ratings_avgpr_new = new_hotels1.groupby(['Hotelid'])[['AveragePricing']].mean().rename(columns = {'AveragePricing':'Mean_avg_pricing'}).reset_index()
ratings_nw_new = new_hotels1.groupby(['Hotelid'])[['num_words']].mean().rename(columns = {'num_words':'Mean_no_words'}).reset_index()
ratings_ps_new = new_hotels1.groupby(['Hotelid'])[['Polarity Score']].mean().rename(columns = {'Polarity Score':'Mean_pol_score'}).reset_index()
ratings_nus_new = new_hotels1.groupby(['Hotelid'])[['Neutral Score']].mean().rename(columns = {'Neutral Score':'Mean_neu_score'}).reset_index()
ratings_neg_new = new_hotels1.groupby(['Hotelid'])[['Negative Score']].mean().rename(columns = {'Negative Score':'Mean_neg_score'}).reset_index()
ratings_pos_new = new_hotels1.groupby(['Hotelid'])[['Positive Score']].mean().rename(columns = {'Positive Score':'Mean_pos_score'}).reset_index()
# Quick shape check on one aggregate.
ratings_nr_new.shape
print(existing_hotels1.shape)
print(new_hotels1.shape)
existing_hotels1.head()
existing_hotels1.dtypes
# DataFrame.convert_objects() was removed from pandas.  Convert what can be
# converted to numeric and leave non-numeric columns (e.g. Hotelid strings)
# unchanged, matching the old convert_numeric behaviour for this data.
existing_hotels1 = existing_hotels1.apply(pd.to_numeric, errors='ignore')
new_hotels1 = new_hotels1.apply(pd.to_numeric, errors='ignore')
print(existing_hotels1.shape)
print(new_hotels1.shape)
# Merge the per-feature aggregates into one row per hotel.  (Removed the dead
# `df = existing_hotels1` / `df1 = new_hotels1` assignments — both were
# immediately overwritten by `df_list[0]` / `df1_list[0]`.)
# NOTE(review): ratings_nr / ratings_hnr are computed above but not merged
# here — confirm that is intentional.
df_list = [ratings_vr, ratings_rr, ratings_lr, ratings_cr, ratings_chr, ratings_sr, ratings_bsr, ratings_avgor, ratings_avgpr, ratings_nw, ratings_ps, ratings_nus, ratings_neg, ratings_pos]
df = df_list[0]
for df_ in df_list[1:]:
    df = df.merge(df_, on='Hotelid')
existing_hotels_final = df
existing_hotels_final.head()
df1_list = [ratings_vr_new, ratings_rr_new, ratings_lr_new, ratings_cr_new, ratings_chr_new, ratings_sr_new, ratings_bsr_new, ratings_avgor_new, ratings_avgpr_new, ratings_nw_new, ratings_ps_new, ratings_nus_new, ratings_neg_new, ratings_pos_new]
df1 = df1_list[0]
for df1_ in df1_list[1:]:
    df1 = df1.merge(df1_, on='Hotelid')
new_hotels_final = df1
print(existing_hotels_final.shape)
print(new_hotels_final.shape)
existing_avg_ratings = existing_hotels_final
new_avg_ratings = new_hotels_final
# Hotelid is an identifier, not a feature.
existing_avg_ratings = existing_avg_ratings.drop(['Hotelid'], axis=1)
new_avg_ratings = new_avg_ratings.drop(['Hotelid'], axis=1)
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
# Standardize each feature to zero mean / unit variance.  (The original
# comment claimed a 0-1 range, which describes MinMaxScaler, not this.)
scaler = StandardScaler()
# Fit on the existing-hotels data only ...
scaler.fit(existing_avg_ratings)
existing_avg_ratings1 = scaler.transform(existing_avg_ratings)
# ... and reuse the SAME fitted scaler for the new hotels.  Re-fitting on the
# new data (as the original did) would put the two datasets on different
# scales and make the later kmeans.predict() on new hotels meaningless.
new_avg_ratings1 = scaler.transform(new_avg_ratings)
from sklearn.cluster import KMeans
# Elbow curve: within-cluster sum of squares (inertia) for k = 1..13.
wcss = []
cluster_range = range(1, 14)
for k in cluster_range:
    model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    model.fit(existing_avg_ratings1)
    wcss.append(model.inertia_)
plt.plot(cluster_range, wcss)
#plt.title("The Elbow Method")
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
existing_avg_ratings2 = pd.DataFrame(existing_avg_ratings1)
# Applying k-means with k=3 (chosen from the elbow plot above).
kmeans = KMeans(n_clusters=3,init='k-means++',max_iter=300,n_init=10,random_state=0)
y_kmeans = kmeans.fit_predict(existing_avg_ratings2)
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() returns the
# same underlying ndarray.
existing_avg_ratings2 = existing_avg_ratings2.to_numpy()
# Map each hotel to its assigned cluster.
Hotelid = existing_hotels_final['Hotelid']
result = pd.DataFrame({'Hotelid':Hotelid, 'cluster_id' : y_kmeans})
result.shape
result.tail()
# Visualising the clusters: first two standardized features per hotel,
# coloured by cluster assignment, with the centroids overlaid in yellow.
cluster_specs = [(0, 'blue', 'good'), (1, 'red', 'bad'), (2, 'green', 'Excellent')]
for cid, colour, lbl in cluster_specs:
    members = y_kmeans == cid
    plt.scatter(existing_avg_ratings2[members, 0], existing_avg_ratings2[members, 1], s=100, c=colour, label=lbl)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
#plt.title('Clusters of hotels')
plt.legend()
plt.show()
new_avg_ratings2 = pd.DataFrame(new_avg_ratings1)
# Assign each new hotel to the nearest cluster learned on existing hotels.
new_kmeans = kmeans.predict(new_avg_ratings2)
# as_matrix() was removed in pandas 1.0; to_numpy() is the replacement.
new_avg_ratings2 = new_avg_ratings2.to_numpy()
new_Hotelid = new_hotels_final['Hotelid']
result_new = pd.DataFrame({'Hotelid':new_Hotelid, 'cluster_id' : new_kmeans})
result_new.shape
result_new.head()
# Visualising the clusters for the NEW hotels, using the cluster ids
# predicted by the model trained on existing hotels.
cluster_specs = [(0, 'blue', 'good'), (1, 'red', 'bad'), (2, 'green', 'Excellent')]
for cid, colour, lbl in cluster_specs:
    members = new_kmeans == cid
    plt.scatter(new_avg_ratings2[members, 0], new_avg_ratings2[members, 1], s=100, c=colour, label=lbl)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
#plt.title('Clusters of hotels')
plt.legend()
plt.show()
import pandas as pd
import os
import numpy as np
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate
import warnings; warnings.simplefilter('ignore')
# Raw string prevents backslash-escape surprises in the Windows path.
os.chdir(r"D:\PhD")
# User -> hotel overall ratings used for the recommender systems below.
ratings = pd.read_csv("user_hotel_rating-1555730075105.csv")
ratings.head()
print("Total Data:")
print("Total number of hotel ratings = "+str(ratings.shape[0]))
print("Number of unique user reviews = "+str(len(np.unique(ratings["userid"]))))
print("Number of unique hotels = "+str(len(np.unique(ratings["Hotelid"]))))
# Hotels ordered by how many ratings they received.
rating_count = ratings.groupby('Hotelid')['OverallRating'].count().reset_index().sort_values('OverallRating', ascending=False)
rating_count.head()
# Distribution of rating values across all reviews.
plt.rc("font", size=15)
ratings.OverallRating.value_counts(sort=False).plot(kind='bar')
plt.title('Rating Distribution\n')
plt.xlabel('OverallRating')
plt.ylabel('Count')
plt.savefig('system1.png', bbox_inches='tight')
plt.show()
# Mean rating and rating count per hotel, most-rated first.
average_rating = pd.DataFrame(ratings.groupby('Hotelid')['OverallRating'].mean())
average_rating['ratingCount'] = pd.DataFrame(ratings.groupby('Hotelid')['OverallRating'].count())
average_rating.sort_values('ratingCount', ascending=False).head()
# Number of ratings submitted per user.
rating_count_user = pd.DataFrame(ratings.groupby('userid')['OverallRating'].count())
rating_count_user.sort_values('OverallRating', ascending=False).head()
ratings.head(2)
Of the two recommender systems, I will use the item-based one here: a user-based system could be influenced by changes in people's hotel tastes over time, and since there are fewer hotels than users, the item-based approach also speeds up our calculations.
Before calculating all the correlations and preparing our data, let's make a few quick observations: with this model we have no mathematical way to measure its accuracy, but we can apply common sense and intuition. For example, we can check whether our list contains hotels that we already know should be correlated.
For example, let us consider the hotel "hotel_608", which received the most ratings and has an overall average rating of 3.5. The idea is that if someone visited this hotel and rated it highly, I would expect them to also like other hotels with high overall ratings. In this case, hotel_557 has the second-highest overall rating.
# Pivot to a user x hotel rating matrix (NaN where a user did not rate).
ratings = ratings.pivot_table(index = 'userid', columns='Hotelid', values= 'OverallRating')
ratings.head()
# Sanity check on the most-rated hotel.
X = ratings["hotel_608"]
X.head()
# Correlation of every other hotel's rating column with hotel_608.
corr = ratings.corrwith(X)
corr.sort_values(ascending=False).head()
# Full item-item Pearson correlation matrix; min_periods=100 drops hotel
# pairs with fewer than 100 common raters to avoid noisy correlations.
matrix_corr = ratings.corr(method='pearson', min_periods=100)
matrix_corr.head()
matrix_corr.shape
# Hotels actually rated by the first user.
ratings.iloc[0].dropna()
# Lets create a series that will contain all the correlations:
# weight each rated hotel's correlation column by the user's rating, then
# sum the weighted correlations per candidate hotel.  Built as a list and
# concatenated once — Series.append() was removed in pandas 2.0, and
# repeated appends were quadratic anyway.
userid = 1
weighted_corrs = []
for Hotelid in ratings.iloc[userid].dropna().index:
    corr_list = matrix_corr[Hotelid].dropna() * ratings.iloc[userid][Hotelid]
    weighted_corrs.append(corr_list)
user_corr = pd.concat(weighted_corrs) if weighted_corrs else pd.Series(dtype='float64')
user_corr.head()
corr_list.head()
# Sum the weighted scores per hotel.
user_corr = user_corr.groupby(user_corr.index).sum()
user_corr.head()
# Remove hotels the user has already visited from the recommendation pool.
# (Hoisted the repeated `ratings.iloc[userid].dropna().index` computation
# out of the loop and replaced the manual append/else-pass with a
# comprehension.)
visited_hotels = ratings.iloc[userid].dropna().index
hotel_list = [hotel for hotel in visited_hotels if hotel in user_corr]
user_corr = user_corr.drop(hotel_list)
print("Dear User, Based on the Hotels you have visited: \n")
for i in visited_hotels:
    print(i)
print("\n We would suggest you to visit below Hotels \n")
# Seven highest-scoring unvisited hotels.
for i in user_corr.sort_values(ascending=False).index[:7]:
    print(i)
# Reload the raw long-form ratings (the pivoted `ratings` above replaced it).
rating = pd.read_csv("user_hotel_rating-1555730075105.csv")
# NOTE(review): assumes the CSV has exactly these three columns in this
# order — confirm against the file header.
rating.columns = ['userid', 'Hotelid', 'OverallRating']
import os
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
import surprise
df = rating
# Ratings are on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df[['userid', 'Hotelid', 'OverallRating']], reader)
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    # Mean metrics plus the algorithm name.  Series.append() was removed in
    # pandas 2.0 — build the row with pd.concat instead; the class name
    # comes from type() rather than parsing str(algorithm).
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results
# ALS options for the KNNBaseline algorithm's internal baseline estimates.
bsl_options = {'method': 'als',
'n_epochs': 5,
'reg_u': 12,
'reg_i': 5
}
algo = KNNBaseline(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
# Final fit on an 80/20 split; report held-out RMSE.
trainset, testset = train_test_split(data, test_size=0.20)
algo = KNNBaseline(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)
# Inspect one sample prediction tuple.
predictions[2]
def get_Iu(uid):
    """Return the number of items rated by the given user.

    args:
        uid: the raw id of the user
    returns:
        the number of items rated by the user, or 0 if the user was not
        part of the trainset.
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
    except ValueError:  # user was not part of the trainset
        return 0
    return len(trainset.ur[inner_uid])
def get_Ui(iid):
    """Return the number of users that have rated the given item.

    args:
        iid: the raw id of the item
    returns:
        the number of users that have rated the item, or 0 if the item was
        not part of the trainset.
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
    except ValueError:  # item was not part of the trainset
        return 0
    return len(trainset.ir[inner_iid])
# Flatten the surprise prediction tuples into a DataFrame.
df = pd.DataFrame(predictions, columns=['userid', 'Hotelid', 'OverallRating', 'Predictions', 'details'])
# Support counts: items rated by this user / users who rated this hotel.
df['Iu'] = df.userid.apply(get_Iu)
df['Ui'] = df.Hotelid.apply(get_Ui)
# Absolute prediction error per test example.
df['err'] = abs(df.Predictions - df.OverallRating)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]
best_predictions
worst_predictions
# Reload the raw ratings for the SVD-based recommender.
ratings = pd.read_csv("user_hotel_rating-1555730075105.csv")
# Removing user_ Sign from userid column and hotel_ from hotelid columns
ratings['Hotelid'] = ratings['Hotelid'].str.replace('hotel_', '').astype('int64')
ratings['userid'] = ratings['userid'].str.replace('user_', '').astype('int64')
ratings.head(2)
n_users = ratings.userid.unique().shape[0]
n_hotels = ratings.Hotelid.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of Hotels = ' + str(n_hotels))
# Dense user x hotel matrix; unrated cells filled with 0 for the SVD below.
Ratings = ratings.pivot(index = 'userid', columns ='Hotelid', values = 'OverallRating').fillna(0)
Ratings.head(2)
Ratings.to_csv("Ratings.csv")
# NOTE(review): "Ratings1.csv" is never written by this notebook — it appears
# to be a manually edited copy of Ratings.csv with a 'Row_no' column added
# (see recommend_hotels below); confirm the file exists next to the data.
Ratings1 = pd.read_csv("Ratings1.csv")
Ratings1.head(2)
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() returns the
# same ndarray.
R = Ratings.to_numpy()
# Centre each user's ratings around their own mean before factorizing.
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)
from scipy.sparse.linalg import svds
# Truncated SVD with 50 latent factors.
U, sigma, Vt = svds(Ratings_demeaned, k = 50)
sigma = np.diag(sigma)
# Reconstruct the full matrix and add the user means back.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()
def recommend_hotels(predictions, userid, Ratings1, original_ratings, num_recommendations):
    """Recommend the top-N hotels the given user has not rated yet.

    Fixed: the original body ignored its `predictions` and `original_ratings`
    parameters and read the module-level `preds` / `ratings` globals instead;
    it now uses the arguments it is given (callers passing the globals see
    identical results).

    Parameters
    ----------
    predictions : DataFrame of predicted ratings, one row per user (row order
        matching Ratings1's Row_no), columns indexed by Hotelid.
    userid : raw id of the user to recommend for.
    Ratings1 : DataFrame mapping 'userid' to its 1-based 'Row_no' in
        `predictions`.
    original_ratings : long-form DataFrame with userid/Hotelid/OverallRating.
    num_recommendations : number of hotels to return.

    Returns
    -------
    (user_full, recommendations) : the user's existing ratings sorted
    descending, and the top predicted hotels the user has not rated.
    """
    # Get and sort the user's predictions (Row_no is 1-based, iloc is 0-based).
    user_row_number = Ratings1.loc[Ratings1['userid'] == userid, 'Row_no'].iloc[0] - 1
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False)
    # The user's existing ratings, highest first.
    user_data = original_ratings[original_ratings.userid == (userid)]
    user_full = user_data.sort_values(['OverallRating'], ascending=False)
    print(user_full.head(2))
    print('User {0} has already rated {1} hotels.'.format(userid, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings hotels not already rated.'.format(num_recommendations))
    # Recommend the highest predicted rating hotels the user hasn't rated yet.
    recommendations = (original_ratings[~original_ratings['Hotelid'].isin(user_full['Hotelid'])].
                       merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left',
                             left_on='Hotelid',
                             right_on='Hotelid').
                       rename(columns={user_row_number: 'Predictions'}).
                       sort_values('Predictions', ascending=False).drop_duplicates('Hotelid').drop(['userid'], axis=1).iloc[:num_recommendations]
                       )
    return user_full, recommendations
# Top-5 recommendations for two sample users.  Note: this rebinds
# `predictions` from the surprise test predictions above to the
# recommendation DataFrame.
already_rated, predictions = recommend_hotels(preds, 78131, Ratings1, ratings, 5)
predictions
already_rated, predictions = recommend_hotels(preds, 21051, Ratings1, ratings, 5)